-
-
Notifications
You must be signed in to change notification settings - Fork 33.1k
gh-139772: Add PyDict_NewPresized() function #139773
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
I converted this PR to a draft for now, since it seems that the API is misused by third-party projects, and I proposed |
eb555c6
to
8bb9715
Compare
8bb9715
to
8a61f5a
Compare
I rewrote the PR to add a unicode_keys parameter: |
There are two news entries. |
Benchmark on PyDict_New() vs PyDict_NewPresized() with Unicode keys:
Benchmark hidden because not significant (1): dict-1
Code:
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 4e73be20e1b..a1eaed01178 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -2562,6 +2562,77 @@ toggle_reftrace_printer(PyObject *ob, PyObject *arg)
Py_RETURN_NONE;
}
+
+/* Benchmark helper: time the construction of `loops` dicts of `size`
+ * items each, every dict starting out empty via PyDict_New().
+ *
+ * args is parsed as two Py_ssize_t values: (size, loops).  For each dict
+ * the keys are the decimal strings of 0..size-1 and the values are the
+ * matching integers.
+ *
+ * Returns the elapsed performance-counter time in seconds as a float.
+ * Returns NULL if argument parsing, an allocation or an insertion fails.
+ */
+static PyObject *
+bench_dict_new(PyObject *ob, PyObject *args)
+{
+    Py_ssize_t size, loops;
+    if (!PyArg_ParseTuple(args, "nn", &size, &loops)) {
+        return NULL;
+    }
+
+    PyTime_t t1, t2;
+    PyTime_PerfCounterRaw(&t1);
+    for (Py_ssize_t loop=0; loop < loops; loop++) {
+        PyObject *d = PyDict_New();
+        if (d == NULL) {
+            return NULL;
+        }
+
+        for (Py_ssize_t i=0; i < size; i++) {
+            PyObject *key = PyUnicode_FromFormat("%zi", i);
+            if (key == NULL) {
+                Py_DECREF(d);
+                return NULL;
+            }
+
+            PyObject *value = PyLong_FromLong(i);
+            if (value == NULL) {
+                Py_DECREF(key);
+                Py_DECREF(d);
+                return NULL;
+            }
+
+            /* PyDict_SetItem() does not steal references, so release ours
+             * whether or not the insertion succeeded.  The call is kept
+             * outside assert() so it still runs when assertions are
+             * compiled out. */
+            int res = PyDict_SetItem(d, key, value);
+            Py_DECREF(key);
+            Py_DECREF(value);
+            if (res < 0) {
+                Py_DECREF(d);
+                return NULL;
+            }
+        }
+
+        assert(PyDict_Size(d) == size);
+        Py_DECREF(d);
+    }
+    PyTime_PerfCounterRaw(&t2);
+
+    return PyFloat_FromDouble(PyTime_AsSecondsDouble(t2 - t1));
+}
+
+
+/* Benchmark helper: time the construction of `loops` dicts of `size`
+ * items each, every dict being created with PyDict_NewPresized(size, 1).
+ *
+ * args is parsed as two Py_ssize_t values: (size, loops).  For each dict
+ * the keys are the decimal strings of 0..size-1 and the values are the
+ * matching integers.
+ *
+ * Returns the elapsed performance-counter time in seconds as a float.
+ * Returns NULL if argument parsing, an allocation or an insertion fails.
+ */
+static PyObject *
+bench_dict_presized(PyObject *ob, PyObject *args)
+{
+    Py_ssize_t size, loops;
+    if (!PyArg_ParseTuple(args, "nn", &size, &loops)) {
+        return NULL;
+    }
+
+    PyTime_t t1, t2;
+    PyTime_PerfCounterRaw(&t1);
+    for (Py_ssize_t loop=0; loop < loops; loop++) {
+        PyObject *d = PyDict_NewPresized(size, 1);
+        if (d == NULL) {
+            return NULL;
+        }
+
+        for (Py_ssize_t i=0; i < size; i++) {
+            PyObject *key = PyUnicode_FromFormat("%zi", i);
+            if (key == NULL) {
+                Py_DECREF(d);
+                return NULL;
+            }
+
+            PyObject *value = PyLong_FromLong(i);
+            if (value == NULL) {
+                Py_DECREF(key);
+                Py_DECREF(d);
+                return NULL;
+            }
+
+            /* PyDict_SetItem() does not steal references, so release ours
+             * whether or not the insertion succeeded.  The call is kept
+             * outside assert() so it still runs when assertions are
+             * compiled out. */
+            int res = PyDict_SetItem(d, key, value);
+            Py_DECREF(key);
+            Py_DECREF(value);
+            if (res < 0) {
+                Py_DECREF(d);
+                return NULL;
+            }
+        }
+
+        assert(PyDict_Size(d) == size);
+        Py_DECREF(d);
+    }
+    PyTime_PerfCounterRaw(&t2);
+
+    return PyFloat_FromDouble(PyTime_AsSecondsDouble(t2 - t1));
+}
+
+
static PyMethodDef TestMethods[] = {
{"set_errno", set_errno, METH_VARARGS},
{"test_config", test_config, METH_NOARGS},
@@ -2656,6 +2727,8 @@ static PyMethodDef TestMethods[] = {
{"test_atexit", test_atexit, METH_NOARGS},
{"code_offset_to_line", _PyCFunction_CAST(code_offset_to_line), METH_FASTCALL},
{"toggle_reftrace_printer", toggle_reftrace_printer, METH_O},
+ {"bench_dict_new", bench_dict_new, METH_VARARGS},
+ {"bench_dict_presized", bench_dict_presized, METH_VARARGS},
{NULL, NULL} /* sentinel */
};
# pyperf driver: time _testcapi.bench_dict_new for a range of dict sizes.
import pyperf
import functools
import _testcapi

SIZES = (1, 10, 100, 1_000, 10_000)

bench = pyperf.Runner()
for n_items in SIZES:
    label = f'dict-{n_items:,}'
    # Pre-bind the dict size; the remaining argument is left for pyperf
    # to supply (presumably the loop count -- confirm against pyperf docs).
    timed = functools.partial(_testcapi.bench_dict_new, n_items)
    bench.bench_time_func(label, timed)
import pyperf
import functools
import _testcapi
runner = pyperf.Runner()
for size in (1, 10, 100, 1_000, 10_000):
func = functools.partial(_testcapi.bench_dict_presized, size)
runner.bench_time_func(f'dict-{size:,}', func) |
I created capi-workgroup/decisions#80 to the C API Working Group for this API. |
Benchmark on
|
This seems useful to me for PyO3 👍 I am unsure how reliably we will be able to use the |
Correct. If you know your input data, you can set the unicode_keys hint in advance, before consuming the iterator. You can use If you don't know your input data, you might need to consume the iterator and store keys and values in a temporary array, and then call |
This seems the wrong way around to me as a user; if I don't know my input data I'd rather not collect it to a temporary array, it could be a large dataset which would be a big temporary allocation. If I know the input data, I was thinking I would even be able to allocate the items in stack memory before calling |
Or are you saying that it is more efficient to use |
Oh, I don't know which function is faster. So I ran benchmarks: #139963 (comment). |
📚 Documentation preview 📚: https://cpython-previews--139773.org.readthedocs.build/